1. IMPORT


library(dada2)
packageVersion("dada2") # check dada2 version
## [1] '1.22.0'
library(Biostrings)
library(ShortRead)
library(seqTools) # per base sequence content
library(phyloseq)
library(ggplot2)
library(data.table)
library(plyr)
library(dplyr)
library(qckitfastq) # per base sequence content
library(stringr)

# ROOT DIRECTORY (to modify on your computer)
path.root <- "~/Projects/MetaIBS"
path.nagel <- file.path(path.root, "scripts/analysis-individual/Nagel-2016") # analysis scripts for this dataset
path.data  <- file.path(path.root, "data/analysis-individual/Nagel-2016")    # raw + intermediate data for this dataset

2. QUALITY CHECK


2.1. Fastq quality profiles

First, we import the fastq files containing the raw reads. The samples were kindly shared by the author.

# Save the path to the directory containing the fastq zipped files
path.fastq <- file.path(path.data, "raw_fastq")
# list.files(path.fastq) # check we are in the right directory

# fastq filenames have format: SAMPLENAME_*.fastq
# Saves the whole directory path to each file name
FNs <- sort(list.files(path.fastq, pattern=".fastq", full.names = TRUE))
show(FNs[1:5])

# Extract sample names: keep everything before the first underscore
sample.names <- sapply(strsplit(basename(FNs), "_"), `[`, 1)
show(sample.names) # saves only the file name (without the path)

# After demultiplexing, sanity check of read length.
# Read the first fastq once (the original parsed the same file twice).
first_reads <- readFastq(FNs[1])
seqlen.tab <- table(width(first_reads))
plot(x=as.integer(names(seqlen.tab)), y=log10(as.integer(seqlen.tab)))

table(width(first_reads)==0) # some reads are of length 0 => need to remove them

Some reads are of length 0. To avoid having issues with looking at the quality profile of the samples, we will first remove all reads below 50bp.

# Place length-filtered reads in a raw_fastq_trimmed/ subdirectory
trim_samples <- file.path(path.data, "raw_fastq_trimmed", paste0(sample.names, "_trim.fastq.gz"))
names(trim_samples) <- sample.names # assign names

# Remove reads with less than 50bp (zero-length reads break quality profiling)
trim <- filterAndTrim(fwd = FNs, filt = trim_samples,
                     truncQ=0, # default truncQ is 2; 0 disables quality truncation here
                     minLen = 50, # Discard reads shorter than 50 bp.
                     compress=TRUE,
                     verbose=TRUE)
# ~ 85-98% sequences filtered

Now we can look at the quality profile of the samples.

# Look at quality of all files
for (i in 1:4){ # 1:length(trim_samples)
  show(plotQualityProfile(trim_samples[i]))
}

# Look at nb of reads per sample.
# raw_stats is used further down the script (primer-search loop and the final
# saveRDS()), so it must actually be computed, not left commented out.
# NOTE(review): assumes seqTools::fastqq() accepts the vector of files and its
# @nReads slot gives one count per file — confirm on your seqTools version.
raw_stats <- data.frame('sample' = sample.names,
                        'reads'  = seqTools::fastqq(trim_samples)@nReads)
min(raw_stats$reads)
max(raw_stats$reads)
mean(raw_stats$reads)

We will have a quick peek at the per base sequence content of the reads in some samples, to make sure there is no anomaly.

# Look at per base sequence content of the first trimmed sample
fseq <- seqTools::fastqq(trim_samples[1])
## [fastqq] File ( 1/1) '/Users/enigma/Projects/MetaIBS/data/analysis-individual/Nagel-2016/raw_fastq_trimmed/DN1_trim.fastq.gz'    done.
rc <- read_content(fseq) # qckitfastq: per-position base composition
plot_read_content(rc) + labs(title = "Per base sequence content")

# Zoom in on the first 50 bases
plot_read_content(rc) + xlim(0,50) + labs(title = "Per base sequence content")

2.2. Look for primers

Now, we will look whether the reads still contain the primers. Primer sequences are given in the methods section of the paper.

FWD <- "GTGCCAGCMGCCGCGGTAA"  # 515F primer sequence (forward; IUPAC ambiguity code M)
REV <- "GGACTACHVGGGTWTCTAAT" # 806R primer sequence (reverse; IUPAC ambiguity codes H, V, W)

# Function that, from the primer sequence, will return all combinations possible (complement, reverse complement, ...)
# From a primer sequence, return all four possible orientations
# (Forward, Complement, Reverse, RevComp) as a named character vector.
allOrients <- function(primer) {
    # Fail early and clearly if Biostrings is not installed
    # (require() would only return FALSE and fail later, less clearly).
    stopifnot(requireNamespace("Biostrings", quietly = TRUE))
    # Biostrings works w/ DNAString objects rather than character vectors
    dna <- Biostrings::DNAString(primer)
    orients <- c(Forward = dna, Complement = Biostrings::complement(dna),
        Reverse = Biostrings::reverse(dna),
        RevComp = Biostrings::reverseComplement(dna))
    # vapply() instead of sapply(): guarantees a named character vector
    return(vapply(orients, toString, character(1)))
}

# Get all orientations of the primer sequences
FWD.orients <- allOrients(FWD) # 515F
REV.orients <- allOrients(REV) # 806R
FWD.orients # sanity check
##               Forward            Complement               Reverse 
## "GTGCCAGCMGCCGCGGTAA" "CACGGTCGKCGGCGCCATT" "AATGGCGCCGMCGACCGTG" 
##               RevComp 
## "TTACCGCGGCKGCTGGCAC"
REV.orients # sanity check
##                Forward             Complement                Reverse 
## "GGACTACHVGGGTWTCTAAT" "CCTGATGDBCCCAWAGATTA" "TAATCTWTGGGVHCATCAGG" 
##                RevComp 
## "ATTAGAWACCCBDGTAGTCC"
# Count in how many reads of a fastq file a given sequence is found.
# fixed = FALSE interprets IUPAC ambiguity codes; up to 2 mismatches allowed.
primerHits <- function(primer, fn) {
    reads <- sread(readFastq(fn))
    hit.counts <- vcountPattern(primer, reads, fixed = FALSE, max.mismatch = 2)
    sum(hit.counts > 0)
}

# Look in all samples if we find the primers.
# Total read counts come from the filterAndTrim() output 'trim':
# its 'reads.out' column is the number of reads kept per trimmed file.
# (The original referenced raw_stats, whose definition was commented out.)
for (i in 1:5){
  cat("SAMPLE", sample.names[i], "with total number of", trim[i, "reads.out"], "reads\n\n")
  x <-  rbind(ForwardPrimer = sapply(FWD.orients, primerHits, fn = trim_samples[[i]]), 
              ReversePrimer = sapply(REV.orients, primerHits, fn = trim_samples[[i]]))
  print(x)
  cat("\n____________________________________________\n\n")
}
## SAMPLE DN1 with total number of 31424 reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer   21239          0       0       0
## ReversePrimer       0          0       0   14432
## 
## ____________________________________________
## 
## SAMPLE DN10 with total number of 33687 reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer   27440          0       0       0
## ReversePrimer       0          0       0   18160
## 
## ____________________________________________
## 
## SAMPLE DN11 with total number of 41542 reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer   39768          0       0       0
## ReversePrimer       0          0       0   24986
## 
## ____________________________________________
## 
## SAMPLE DN12 with total number of 40803 reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer   38960          0       0       0
## ReversePrimer       0          0       0   25900
## 
## ____________________________________________
## 
## SAMPLE DN13 with total number of 41982 reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer   40741          0       0       0
## ReversePrimer       0          0       0   29953
## 
## ____________________________________________

Let’s have a quick look at where primers are positioned in the reads

# For one fastq file, return a data.frame with one row per primer hit:
# sample name, start position of the hit, and the file's longest read length.
primerHitsPosition <- function(primer, fn){
  # Read the fastq once; the original additionally ran a full
  # seqTools::fastqq() pass on the same file just to get maxSeqLen.
  reads <- sread(readFastq(fn))
  hits <- as.data.frame(vmatchPattern(primer, reads, fixed = FALSE, max.mismatch = 2))
  hits <- hits[,c("group", "start")]
  colnames(hits) <- c("sample", "start")
  hits$sample <- sapply(strsplit(basename(fn), "_"), `[`, 1)  # sample = filename before first "_"
  hits$readslength <- max(width(reads))  # longest read length in the file
  return(hits)
}

# Get position of FWD primers.
# Build the per-sample hit tables with lapply + one do.call(rbind, ...)
# instead of growing a data.frame with rbind inside the loop (O(n^2) copies).
FWDpos.list <- lapply(1:5, function(i){ # length(trim_samples)
  cat("SAMPLE", i)
  primerHitsPosition(FWD.orients["Forward"], trim_samples[[i]])
})
FWDpos <- do.call(rbind, FWDpos.list)
ggplot(FWDpos, aes(x=start))+
  geom_density(aes(y=after_stat(scaled))) + # after_stat() replaces deprecated ..scaled..
  xlim(c(0,max(FWDpos$readslength)))+
  labs(x="start position of FWD primer", y="proportion of primers starting at x position")

# Get position of REV primers (same approach as for FWD primers:
# lapply + one do.call(rbind, ...) instead of rbind inside the loop).
REVpos.list <- lapply(1:5, function(i){ # length(trim_samples)
  cat("SAMPLE", i)
  primerHitsPosition(REV.orients["RevComp"], trim_samples[[i]])
})
REVpos <- do.call(rbind, REVpos.list)
ggplot(REVpos, aes(x=start))+
  geom_density(aes(y=after_stat(scaled))) + # after_stat() replaces deprecated ..scaled..
  xlim(c(0,max(REVpos$readslength)))+
  labs(x="start position of REV primer", y="proportion of primers starting at x position")


3. FILTER AND TRIM


3.1. Primer removal

The reads indeed contain both primers (although only about 60% of reads have reverse primer). We will keep only reads containing both primers, and trim the primers. It seems like the reverse primer is found around 300bp, which is consistent with the length of the variable region.

# KEEP READS WITH PRIMER AND REMOVE PRIMER
# Place primer-free files in a filtered1/ subdirectory
filt1_samples <- file.path(path.data, "filtered1", paste0(sample.names, "_filt1.fastq.gz"))
# Assign names for the filtered fastq.gz files
names(filt1_samples) <- sample.names

# Filter first to take out primer.
# NOTE(review): primer.rev is passed in RevComp orientation (as it appears at
# the 3' end of the reads, see position plots above) — confirm this matches the
# orientation expected by dada2::removePrimers on your dada2 version.
out1 <- removePrimers(fn = trim_samples, fout = filt1_samples,
                      primer.fwd = FWD.orients['Forward'],
                      primer.rev = REV.orients['RevComp'],
                      trim.fwd = TRUE, # cut the forward primer off the reads
                      trim.rev = TRUE, # cut the reverse primer off the reads
                      orient = FALSE, # keep the reads in their original orientation
                      compress = TRUE, verbose = TRUE)
# Primer removal summary: reads with both primers found and trimmed
out1[1:4,]
##                    reads.in reads.out
## DN1_trim.fastq.gz     31424     10193
## DN10_trim.fastq.gz    33687     15175
## DN11_trim.fastq.gz    41542     24197
## DN12_trim.fastq.gz    40803     24942
# Quality profile after primer removal
for (i in 1:4){
  show(plotQualityProfile(filt1_samples[i]))
}

3.2. Quality filtering

Then, we perform a quality filtering of the reads.

# Place quality-filtered files in a filtered2/ subdirectory
filt2_samples <- file.path(path.data, "filtered2", paste0(sample.names, "_filt2.fastq.gz"))
# Assign names for the filtered fastq.gz files
names(filt2_samples) <- sample.names

# Quality filter on the primer-free reads
out2 <- filterAndTrim(fwd = filt1_samples, filt = filt2_samples,
                      maxEE=3, # reads with more than 3 expected errors (sum(10e(-Q/10))) are discarded
                      truncQ=10, # Truncate reads at the first instance of a quality score less than or equal to truncQ.
                      minLen = 150, # Discard reads shorter than 150 bp.
                      compress=TRUE,
                      multithread=TRUE,
                      verbose=TRUE)

Let’s look at the output filtered fastq files as sanity check.

out2[1:4,] # show how many reads were filtered in each file
##                     reads.in reads.out
## DN1_filt1.fastq.gz     10193      8087
## DN10_filt1.fastq.gz    15175     12070
## DN11_filt1.fastq.gz    24197     20773
## DN12_filt1.fastq.gz    24942     20882
# Look at quality profile of filtered files
for (i in 1:4){
  show(plotQualityProfile(filt2_samples[i]))
}


4. CONSTRUCT ASV TABLE


4.1. Learn error rates

Now we will build the parametric error model, to be able to infer amplicon sequence variants (ASVs) later on.

set.seed(123) # fix RNG seed: randomize=TRUE shuffles the sample order
err <- learnErrors(filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1)

The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score.

plotErrors(err, nominalQ = TRUE)

As the error rate model doesn’t fit well the observations for high quality scores, we will manually modify it to better fit.

# Modify error rate model manually for high quality scores.
# NOTE: err_out columns are NAMED by quality score ("0", "1", ...). A range
# like '35':'38' is silently coerced to the NUMERIC positions 35:38
# (positional indexing), which targets different columns than the name "35"
# whenever the scores do not start at 1. The single-column assignments below
# (e.g. ['G2T','38']) ARE name-based, so the original mixed two indexing
# schemes. as.character() makes every assignment name-based and consistent.
err_new <- err

# G2T (finer-grained manual fit per quality score)
err_new$err_out['G2T', as.character(35:38)] <- 0.0016
err_new$err_out['G2T','38'] <- 0.0014
err_new$err_out['G2T','39'] <- 0.0013
err_new$err_out['G2T','40'] <- 0.00115
err_new$err_out['G2T','41'] <- 0.0010
err_new$err_out['G2T','42'] <- 0.00085
err_new$err_out['G2T','43'] <- 0.0007
err_new$err_out['G2T','44'] <- 0.00055
err_new$err_out['G2T','45'] <- 0.0004

# All the other transitions/transversions (flat manual values)
err_new$err_out['A2C', as.character(35:46)] <- 0.0008
err_new$err_out['A2G', as.character(38:46)] <- 0.007
err_new$err_out['A2T', as.character(37:46)] <- 0.002
err_new$err_out['C2A', as.character(37:46)] <- 0.0008
err_new$err_out['C2G', as.character(39:46)] <- 0.0025
err_new$err_out['C2T', as.character(38:46)] <- 0.007
err_new$err_out['G2A', as.character(36:46)] <- 0.0035
err_new$err_out['G2C', as.character(36:46)] <- 0.0006
err_new$err_out['T2A', as.character(38:46)] <- 0.0025
err_new$err_out['T2C', as.character(36:46)] <- 0.0065
err_new$err_out['T2G', as.character(36:46)] <- 0.0025
# Check modified error rate model
plotErrors(err_new, nominalQ = TRUE)

4.2. Infer sample composition

The dada() algorithm infers sequence variants based on estimated errors (previous step). Firstly, we de-replicate the reads in each sample, to reduce the computation time. De-replication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.

# Prepare empty list for the inferred sequences (one element per sample)
seq_infered <- vector("list", length(sample.names))
names(seq_infered) <- sample.names # names of each element will be the sample names

# Iterate through the 30 samples: dereplicate, then denoise with the
# manually-adjusted error model (err_new)
for(sampl in sample.names) {
  cat("Processing:", sampl, "\n")
  derep <- derepFastq(filt2_samples[[sampl]]) # dereplicate the reads in the sample
  seq_infered[[sampl]] <- dada(derep, err=err_new, multithread=TRUE, # default parameters
                               HOMOPOLYMER_GAP_PENALTY=-1, BAND_SIZE=32) # parameters recommended for Ion Torrent
}
# Inspect the inferred sequence variants from samples 1:5
for (i in 1:5){
  print(seq_infered[[i]])
  print("________________")
}
## dada-class: object describing DADA2 denoising results
## 72 sequence variants were inferred from 3780 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 32
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 151 sequence variants were inferred from 6122 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 32
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 147 sequence variants were inferred from 7911 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 32
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 189 sequence variants were inferred from 8768 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 32
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 153 sequence variants were inferred from 8981 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 32
## [1] "________________"

4.3. Construct ASV table

We can now construct an amplicon sequence variant (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.

# Make sequence table from the inferred sequence variants (samples x ASVs)
seqtable <- makeSequenceTable(seq_infered)

# We should have 30 samples (30 rows)
dim(seqtable)
## [1]   30 1323
# Inspect distribution of sequence lengths
hist(nchar(getSequences(seqtable)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

4.4. Remove chimeras

The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.

# Remove chimeric ASVs by consensus across samples
seqtable.nochim <- removeBimeraDenovo(seqtable, method="consensus", multithread=TRUE, verbose=TRUE)

# Check how many sequence variants we have after removing chimeras
dim(seqtable.nochim)
## [1]   30 1092
# Check how many reads we have after removing chimeras (we should keep the vast majority of the reads, like > 80%)
sum(seqtable.nochim)/sum(seqtable)
## [1] 0.9758994

5. LOOK AT READS COUNT THROUGH PIPELINE


Sanity check before assigning taxonomy.

# Function that counts nb of reads in a dada2 object
getN <- function(x) sum(getUniques(x))

# Table tracking the number of reads at each step of the pipeline
# (input, primer-filtered, quality-filtered, denoised, non-chimeric, % kept).
# as.integer() is vectorized; the original lapply() produced a *list*
# column, turning 'track' into a matrix of lists.
track <- cbind(out1,
               out2[,2],
               sapply(seq_infered, getN),
               rowSums(seqtable.nochim),
               as.integer(rowSums(seqtable.nochim)*100/out1[,1]))

# Assign column and row names
colnames(track) <- c("input", "primer-filt", "quality-filt", "denoised", "nonchim", "%input->output")
rownames(track) <- sample.names # assumes out1/out2 rows follow sample.names order

# Show final table: for each row/sample, we have shown the initial number of reads, filtered reads, denoised reads, and non chimera reads
track
##      input primer-filt quality-filt denoised nonchim %input->output
## DN1  31424 10193       8087         7890     7819    24            
## DN10 33687 15175       12070        11850    11828   35            
## DN11 41542 24197       20773        20577    20456   49            
## DN12 40803 24942       20882        20471    19964   48            
## DN13 41982 29227       24514        24266    24226   57            
## DN14 42424 24666       20429        20296    18959   44            
## DN15 37990 21802       17476        17252    17149   45            
## DN2  35202 13678       11247        11056    10923   31            
## DN3  35974 16216       13518        13341    13171   36            
## DN4  38369 17413       13694        13519    13379   34            
## DN5  37350 17392       14787        14672    14421   38            
## DN6  37916 18004       15668        15340    14836   39            
## DN7  31704 14910       12296        12188    12188   38            
## DN8  18838 8114        6496         6416     6373    33            
## DN9  41887 18156       14161        14034    13909   33            
## HN1  52361 28770       24065        23848    22807   43            
## HN10 40024 27368       24634        24144    23917   59            
## HN11 37893 23372       21245        21042    20940   55            
## HN12 48034 26205       22944        22782    21863   45            
## HN13 50183 28786       25741        25476    21896   43            
## HN14 50573 26442       20949        20748    20287   40            
## HN15 45578 28139       23727        23468    22289   48            
## HN2  42337 24663       17150        17071    17053   40            
## HN3  43063 25912       23412        23096    22428   52            
## HN4  44553 24358       17246        16941    16624   37            
## HN5  41154 25714       22446        22386    22341   54            
## HN6  34736 11573       6682         6501     6496    18            
## HN7  46448 28456       25118        24741    23963   51            
## HN8  43991 22984       20239        20093    20087   45            
## HN9  44593 24897       21503        21412    21385   47

6. TAXONOMIC TABLE


Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files.

# Directory containing the Silva taxonomic reference fastas
path.silva <- file.path(path.root, "data/analysis-individual/CLUSTER/taxonomy/silva-taxonomic-ref")

# Assign taxonomy (with silva v138); seeded for reproducible bootstrapping
set.seed(123)
taxa <- assignTaxonomy(seqtable.nochim, file.path(path.silva, "silva_nr99_v138.1_train_set.fa.gz"),
                       tryRC = TRUE, # try reverse complement of the sequences
                       multithread=TRUE, verbose = TRUE)

# Add species assignment (exact matching against reference strains)
set.seed(123)
taxa <- addSpecies(taxa, file.path(path.silva, "silva_species_assignment_v138.1.fa.gz"))
# Check how the taxonomy table looks like
taxa.print <- taxa
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
##      Kingdom    Phylum         Class         Order            
## [1,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"  
## [2,] "Bacteria" "Firmicutes"   "Clostridia"  "Lachnospirales" 
## [3,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"  
## [4,] "Bacteria" "Firmicutes"   "Clostridia"  "Lachnospirales" 
## [5,] "Bacteria" "Firmicutes"   "Clostridia"  "Oscillospirales"
## [6,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"  
##      Family            Genus              Species    
## [1,] "Bacteroidaceae"  "Bacteroides"      "vulgatus" 
## [2,] "Lachnospiraceae" "Blautia"          NA         
## [3,] "Bacteroidaceae"  "Bacteroides"      NA         
## [4,] "Lachnospiraceae" "Agathobacter"     NA         
## [5,] "Ruminococcaceae" "Faecalibacterium" NA         
## [6,] "Bacteroidaceae"  "Bacteroides"      "uniformis"
table(taxa.print[,1], useNA="ifany") # Show the different kingdoms (should be only bacteria)
## 
##  Archaea Bacteria 
##        3     1089
table(taxa.print[,2], useNA="ifany") # Show the different phyla
## 
##  Actinobacteriota      Bacteroidota     Cyanobacteria  Desulfobacterota 
##                34               205                 6                12 
##   Elusimicrobiota     Euryarchaeota        Firmicutes    Fusobacteriota 
##                 1                 2               782                 5 
##    Proteobacteria      Synergistota  Thermoplasmatota Verrucomicrobiota 
##                33                 3                 1                 8

7. LAST PREPROCESSING


We will remove any sample with less than 500 reads from further analysis, and also any ASVs with unassigned phyla.

7.1. Create phyloseq object

The preprocessing will be easier to do with ASV, taxonomic and metadata tables combined in a phyloseq object.

#_________________________
# Import metadata (first column of the CSV becomes the sample names)
metadata_table <- read.table(file.path(path.data, "00_Metadata-Nagel/Metadata-Nagel.csv"), header = TRUE, sep = ",", row.names=1)
head(metadata_table)


#_________________________
# Create phyloseq object combining ASV counts, metadata and taxonomy
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
                   sample_data(metadata_table), 
                   tax_table(taxa))

# Remove taxa that are eukaryota, or have unassigned Phyla.
# NOTE(review): Kingdom != "Eukaryota" also drops taxa whose Kingdom is NA
# (the comparison yields NA, which subset_taxa does not keep) — confirm intended.
physeq <- subset_taxa(physeq, Kingdom != "Eukaryota")
physeq <- subset_taxa(physeq, !is.na(Phylum))
# Remove samples with less than 500 reads
physeq <- prune_samples(sample_sums(physeq)>=500, physeq)
# No sample was deleted, so we don't need to remove taxa present in low-count samples
# physeq <- prune_taxa(taxa_sums(physeq)>0, physeq)

7.2. Quick peek at data analysis

# Absolute abundance
# plot_bar(physeq, fill = "Phylum")+ facet_wrap("host_disease", scales="free_x") + theme(axis.text.x = element_blank())

# Relative abundance for Phylum: agglomerate, normalize per sample, melt
phylum.table <- physeq %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

# Stacked barplot of phylum-level relative abundances, faceted by disease status
ggplot(phylum.table, aes(x = Sample, y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(size = 5, angle = -90))+
  labs(x = "Samples", y = "Relative abundance")

7.3. Save to disk

# Save intermediate objects to disk (for reuse without re-running the pipeline)
saveRDS(raw_stats,   file.path(path.data, "01_Dada2-Nagel/raw_stats.rds"))
saveRDS(out1,        file.path(path.data, "01_Dada2-Nagel/out1.rds"))
saveRDS(out2,        file.path(path.data, "01_Dada2-Nagel/out2.rds"))
saveRDS(err,         file.path(path.data, "01_Dada2-Nagel/errorRates.rds"))
saveRDS(err_new,     file.path(path.data, "01_Dada2-Nagel/errorRates_modif.rds"))
saveRDS(seq_infered, file.path(path.data, "01_Dada2-Nagel/seq_infered.rds"))

# Taxa & Phyloseq object (the phyloseq object is saved twice: once as input
# for the phylogenetic-tree pipeline, once in the shared phyloseq collection)
saveRDS(taxa, file.path(path.data, "01_Dada2-Nagel/taxa_nagel.rds"))
saveRDS(physeq, file.path(path.root, "data/analysis-individual/CLUSTER/PhyloTree/input/physeq_nagel.rds"))
saveRDS(physeq, file.path(path.root, "data/phyloseq-objects/phyloseq-without-phylotree/physeq_nagel.rds"))

8. SESSION INFO


sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] stringr_1.5.0               qckitfastq_1.10.0          
##  [3] dplyr_1.1.1                 plyr_1.8.8                 
##  [5] data.table_1.14.8           ggplot2_3.4.2              
##  [7] phyloseq_1.38.0             seqTools_1.28.0            
##  [9] zlibbioc_1.40.0             ShortRead_1.52.0           
## [11] GenomicAlignments_1.30.0    SummarizedExperiment_1.24.0
## [13] Biobase_2.54.0              MatrixGenerics_1.6.0       
## [15] matrixStats_0.63.0          Rsamtools_2.10.0           
## [17] GenomicRanges_1.46.1        BiocParallel_1.28.3        
## [19] Biostrings_2.62.0           GenomeInfoDb_1.30.1        
## [21] XVector_0.34.0              IRanges_2.28.0             
## [23] S4Vectors_0.32.4            BiocGenerics_0.40.0        
## [25] dada2_1.22.0                Rcpp_1.0.10                
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-162           bitops_1.0-7           RColorBrewer_1.1-3    
##  [4] tools_4.1.3            bslib_0.4.2            utf8_1.2.3            
##  [7] R6_2.5.1               vegan_2.6-4            mgcv_1.8-42           
## [10] DBI_1.1.3              colorspace_2.1-0       permute_0.9-7         
## [13] rhdf5filters_1.6.0     ade4_1.7-22            withr_2.5.0           
## [16] tidyselect_1.2.0       compiler_4.1.3         cli_3.6.1             
## [19] DelayedArray_0.20.0    labeling_0.4.2         sass_0.4.5            
## [22] scales_1.2.1           RSeqAn_1.14.0          digest_0.6.31         
## [25] rmarkdown_2.21         jpeg_0.1-10            pkgconfig_2.0.3       
## [28] htmltools_0.5.5        highr_0.10             fastmap_1.1.1         
## [31] rlang_1.1.0            rstudioapi_0.14        farver_2.1.1          
## [34] jquerylib_0.1.4        generics_0.1.3         hwriter_1.3.2.1       
## [37] jsonlite_1.8.4         RCurl_1.98-1.12        magrittr_2.0.3        
## [40] GenomeInfoDbData_1.2.7 biomformat_1.22.0      interp_1.1-4          
## [43] Matrix_1.5-1           munsell_0.5.0          Rhdf5lib_1.16.0       
## [46] fansi_1.0.4            ape_5.7-1              lifecycle_1.0.3       
## [49] stringi_1.7.12         yaml_2.3.7             MASS_7.3-58.3         
## [52] rhdf5_2.38.1           grid_4.1.3             parallel_4.1.3        
## [55] crayon_1.5.2           deldir_1.0-6           lattice_0.20-45       
## [58] splines_4.1.3          multtest_2.50.0        knitr_1.42            
## [61] pillar_1.9.0           igraph_1.4.2           reshape2_1.4.4        
## [64] codetools_0.2-19       glue_1.6.2             evaluate_0.20         
## [67] latticeExtra_0.6-30    RcppParallel_5.1.7     png_0.1-8             
## [70] vctrs_0.6.1            foreach_1.5.2          gtable_0.3.3          
## [73] cachem_1.0.7           xfun_0.38              survival_3.5-5        
## [76] tibble_3.2.1           iterators_1.0.14       cluster_2.1.4